HR ANALYTICS EMPLOYEE ATTRITION & PERFORMANCE

ABHIJITH DAMERUPPALA

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette('muted')[0:5]
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier
In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import sklearn.metrics as metrics

from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier

from sklearn.decomposition import PCA
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.decomposition import FactorAnalysis
In [5]:
data = pd.read_csv("./WA_Fn-UseC_-HR-Employee-Attrition.csv")
In [6]:
data.head()
Out[6]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [7]:
data.columns
Out[7]:
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
In [8]:
data.shape
Out[8]:
(1470, 35)
In [9]:
data.info
Out[9]:
<bound method DataFrame.info of       Age Attrition     BusinessTravel  DailyRate              Department  \
0      41       Yes      Travel_Rarely       1102                   Sales   
1      49        No  Travel_Frequently        279  Research & Development   
2      37       Yes      Travel_Rarely       1373  Research & Development   
3      33        No  Travel_Frequently       1392  Research & Development   
4      27        No      Travel_Rarely        591  Research & Development   
...   ...       ...                ...        ...                     ...   
1465   36        No  Travel_Frequently        884  Research & Development   
1466   39        No      Travel_Rarely        613  Research & Development   
1467   27        No      Travel_Rarely        155  Research & Development   
1468   49        No  Travel_Frequently       1023                   Sales   
1469   34        No      Travel_Rarely        628  Research & Development   

      DistanceFromHome  Education EducationField  EmployeeCount  \
0                    1          2  Life Sciences              1   
1                    8          1  Life Sciences              1   
2                    2          2          Other              1   
3                    3          4  Life Sciences              1   
4                    2          1        Medical              1   
...                ...        ...            ...            ...   
1465                23          2        Medical              1   
1466                 6          1        Medical              1   
1467                 4          3  Life Sciences              1   
1468                 2          3        Medical              1   
1469                 8          3        Medical              1   

      EmployeeNumber  ...  RelationshipSatisfaction StandardHours  \
0                  1  ...                         1            80   
1                  2  ...                         4            80   
2                  4  ...                         2            80   
3                  5  ...                         3            80   
4                  7  ...                         4            80   
...              ...  ...                       ...           ...   
1465            2061  ...                         3            80   
1466            2062  ...                         1            80   
1467            2064  ...                         2            80   
1468            2065  ...                         4            80   
1469            2068  ...                         1            80   

      StockOptionLevel  TotalWorkingYears  TrainingTimesLastYear  \
0                    0                  8                      0   
1                    1                 10                      3   
2                    0                  7                      3   
3                    0                  8                      3   
4                    1                  6                      3   
...                ...                ...                    ...   
1465                 1                 17                      3   
1466                 1                  9                      5   
1467                 1                  6                      0   
1468                 0                 17                      3   
1469                 0                  6                      3   

     WorkLifeBalance  YearsAtCompany YearsInCurrentRole  \
0                  1               6                  4   
1                  3              10                  7   
2                  3               0                  0   
3                  3               8                  7   
4                  3               2                  2   
...              ...             ...                ...   
1465               3               5                  2   
1466               3               7                  7   
1467               3               6                  2   
1468               2               9                  6   
1469               4               4                  3   

      YearsSinceLastPromotion  YearsWithCurrManager  
0                           0                     5  
1                           1                     7  
2                           0                     0  
3                           3                     0  
4                           2                     2  
...                       ...                   ...  
1465                        0                     3  
1466                        1                     7  
1467                        0                     3  
1468                        0                     8  
1469                        1                     2  

[1470 rows x 35 columns]>
In [10]:
data.describe()
Out[10]:
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 ... 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 ... 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 ... 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 ... 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 ... 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 ... 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 ... 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 ... 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000

8 rows × 26 columns

In [11]:
data.isna().all()
Out[11]:
Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesLastYear       False
WorkLifeBalance             False
YearsAtCompany              False
YearsInCurrentRole          False
YearsSinceLastPromotion     False
YearsWithCurrManager        False
dtype: bool
In [12]:
numericData = data.select_dtypes(include = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
In [10]:
plt.subplots(figsize=(13, 8))

sns.heatmap(numericData)


plt.show()
In [13]:
numericData.corr()
Out[13]:
Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
Age 1.000000 0.010661 -0.001686 0.208034 NaN -0.010145 0.010146 0.024287 0.029820 0.509604 ... 0.053535 NaN 0.037510 0.680381 -0.019621 -0.021490 0.311309 0.212901 0.216513 0.202089
DailyRate 0.010661 1.000000 -0.004985 -0.016806 NaN -0.050990 0.018355 0.023381 0.046135 0.002966 ... 0.007846 NaN 0.042143 0.014515 0.002453 -0.037848 -0.034055 0.009932 -0.033229 -0.026363
DistanceFromHome -0.001686 -0.004985 1.000000 0.021042 NaN 0.032916 -0.016075 0.031131 0.008783 0.005303 ... 0.006557 NaN 0.044872 0.004628 -0.036942 -0.026556 0.009508 0.018845 0.010029 0.014406
Education 0.208034 -0.016806 0.021042 1.000000 NaN 0.042070 -0.027128 0.016775 0.042438 0.101589 ... -0.009118 NaN 0.018422 0.148280 -0.025100 0.009819 0.069114 0.060236 0.054254 0.069065
EmployeeCount NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
EmployeeNumber -0.010145 -0.050990 0.032916 0.042070 NaN 1.000000 0.017621 0.035179 -0.006888 -0.018519 ... -0.069861 NaN 0.062227 -0.014365 0.023603 0.010309 -0.011240 -0.008416 -0.009019 -0.009197
EnvironmentSatisfaction 0.010146 0.018355 -0.016075 -0.027128 NaN 0.017621 1.000000 -0.049857 -0.008278 0.001212 ... 0.007665 NaN 0.003432 -0.002693 -0.019359 0.027627 0.001458 0.018007 0.016194 -0.004999
HourlyRate 0.024287 0.023381 0.031131 0.016775 NaN 0.035179 -0.049857 1.000000 0.042861 -0.027853 ... 0.001330 NaN 0.050263 -0.002334 -0.008548 -0.004607 -0.019582 -0.024106 -0.026716 -0.020123
JobInvolvement 0.029820 0.046135 0.008783 0.042438 NaN -0.006888 -0.008278 0.042861 1.000000 -0.012630 ... 0.034297 NaN 0.021523 -0.005533 -0.015338 -0.014617 -0.021355 0.008717 -0.024184 0.025976
JobLevel 0.509604 0.002966 0.005303 0.101589 NaN -0.018519 0.001212 -0.027853 -0.012630 1.000000 ... 0.021642 NaN 0.013984 0.782208 -0.018191 0.037818 0.534739 0.389447 0.353885 0.375281
JobSatisfaction -0.004892 0.030571 -0.003669 -0.011296 NaN -0.046247 -0.006784 -0.071335 -0.021476 -0.001944 ... -0.012454 NaN 0.010690 -0.020185 -0.005779 -0.019459 -0.003803 -0.002305 -0.018214 -0.027656
MonthlyIncome 0.497855 0.007707 -0.017014 0.094961 NaN -0.014829 -0.006259 -0.015794 -0.015271 0.950300 ... 0.025873 NaN 0.005408 0.772893 -0.021736 0.030683 0.514285 0.363818 0.344978 0.344079
MonthlyRate 0.028051 -0.032182 0.027473 -0.026084 NaN 0.012648 0.037600 -0.015297 -0.016322 0.039563 ... -0.004085 NaN -0.034323 0.026442 0.001467 0.007963 -0.023655 -0.012815 0.001567 -0.036746
NumCompaniesWorked 0.299635 0.038153 -0.029251 0.126317 NaN -0.001251 0.012594 0.022157 0.015012 0.142501 ... 0.052733 NaN 0.030075 0.237639 -0.066054 -0.008366 -0.118421 -0.090754 -0.036814 -0.110319
PercentSalaryHike 0.003634 0.022704 0.040235 -0.011111 NaN -0.012944 -0.031701 -0.009062 -0.017205 -0.034730 ... -0.040490 NaN 0.007528 -0.020608 -0.005221 -0.003280 -0.035991 -0.001520 -0.022154 -0.011985
PerformanceRating 0.001904 0.000473 0.027110 -0.024539 NaN -0.020359 -0.029548 -0.002172 -0.029071 -0.021222 ... -0.031351 NaN 0.003506 0.006744 -0.015579 0.002572 0.003435 0.034986 0.017896 0.022827
RelationshipSatisfaction 0.053535 0.007846 0.006557 -0.009118 NaN -0.069861 0.007665 0.001330 0.034297 0.021642 ... 1.000000 NaN -0.045952 0.024054 0.002497 0.019604 0.019367 -0.015123 0.033493 -0.000867
StandardHours NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
StockOptionLevel 0.037510 0.042143 0.044872 0.018422 NaN 0.062227 0.003432 0.050263 0.021523 0.013984 ... -0.045952 NaN 1.000000 0.010136 0.011274 0.004129 0.015058 0.050818 0.014352 0.024698
TotalWorkingYears 0.680381 0.014515 0.004628 0.148280 NaN -0.014365 -0.002693 -0.002334 -0.005533 0.782208 ... 0.024054 NaN 0.010136 1.000000 -0.035662 0.001008 0.628133 0.460365 0.404858 0.459188
TrainingTimesLastYear -0.019621 0.002453 -0.036942 -0.025100 NaN 0.023603 -0.019359 -0.008548 -0.015338 -0.018191 ... 0.002497 NaN 0.011274 -0.035662 1.000000 0.028072 0.003569 -0.005738 -0.002067 -0.004096
WorkLifeBalance -0.021490 -0.037848 -0.026556 0.009819 NaN 0.010309 0.027627 -0.004607 -0.014617 0.037818 ... 0.019604 NaN 0.004129 0.001008 0.028072 1.000000 0.012089 0.049856 0.008941 0.002759
YearsAtCompany 0.311309 -0.034055 0.009508 0.069114 NaN -0.011240 0.001458 -0.019582 -0.021355 0.534739 ... 0.019367 NaN 0.015058 0.628133 0.003569 0.012089 1.000000 0.758754 0.618409 0.769212
YearsInCurrentRole 0.212901 0.009932 0.018845 0.060236 NaN -0.008416 0.018007 -0.024106 0.008717 0.389447 ... -0.015123 NaN 0.050818 0.460365 -0.005738 0.049856 0.758754 1.000000 0.548056 0.714365
YearsSinceLastPromotion 0.216513 -0.033229 0.010029 0.054254 NaN -0.009019 0.016194 -0.026716 -0.024184 0.353885 ... 0.033493 NaN 0.014352 0.404858 -0.002067 0.008941 0.618409 0.548056 1.000000 0.510224
YearsWithCurrManager 0.202089 -0.026363 0.014406 0.069065 NaN -0.009197 -0.004999 -0.020123 0.025976 0.375281 ... -0.000867 NaN 0.024698 0.459188 -0.004096 0.002759 0.769212 0.714365 0.510224 1.000000

26 rows × 26 columns

Lets now visualize the distribution of employes whio left the company vs those who stayed

In [14]:
plt.title("Distribution of Attrition in the dataset")

plt.pie(data.Attrition.value_counts(), 
        labels = data.Attrition.value_counts(),
       colors = color)

plt.legend(data.Attrition.value_counts().index)
Out[14]:
<matplotlib.legend.Legend at 0x26e69406610>
In [13]:
sns.pairplot(numericData)
Out[13]:
<seaborn.axisgrid.PairGrid at 0x1d1d42ba1d0>
In [15]:
attritionYes = data[data.Attrition == 'Yes']
In [16]:
plt.title("Distribution of employees' age who were attritioned")
sns.histplot(attritionYes.Age, color = 'skyblue')
Out[16]:
<AxesSubplot: title={'center': "Distribution of employees' age who were attritioned"}, xlabel='Age', ylabel='Count'>
In [17]:
plt.title("Distribution of employees' Monthly Income who were attritioned")
sns.histplot(attritionYes.MonthlyIncome, color = 'lightgreen')
Out[17]:
<AxesSubplot: title={'center': "Distribution of employees' Monthly Income who were attritioned"}, xlabel='MonthlyIncome', ylabel='Count'>
In [18]:
plt.title("Distribution of employees' Percent of salary hike who were attritioned")

sns.histplot(attritionYes.PercentSalaryHike, color = 'orange')
Out[18]:
<AxesSubplot: title={'center': "Distribution of employees' Percent of salary hike who were attritioned"}, xlabel='PercentSalaryHike', ylabel='Count'>
In [19]:
plt.title("Distribution of employees' years at the company before they were attritioned")

sns.histplot(attritionYes.YearsAtCompany, color = 'purple')
Out[19]:
<AxesSubplot: title={'center': "Distribution of employees' years at the company before they were attritioned"}, xlabel='YearsAtCompany', ylabel='Count'>
In [20]:
plt.title("Scatter plot of employees' age v/s Monthly income")

sns.scatterplot(x = data.MonthlyIncome, y = data.Age, hue = data.Attrition)
Out[20]:
<AxesSubplot: title={'center': "Scatter plot of employees' age v/s Monthly income"}, xlabel='MonthlyIncome', ylabel='Age'>
In [22]:
usefulData = data[['YearsAtCompany', 'PercentSalaryHike', 'MonthlyIncome', 'Age' ,'NumCompaniesWorked']]
In [21]:
sns.pairplot(usefulData)
Out[21]:
<seaborn.axisgrid.PairGrid at 0x1d1fb8f7890>
In [23]:
usefulData.corr()
Out[23]:
YearsAtCompany PercentSalaryHike MonthlyIncome Age NumCompaniesWorked
YearsAtCompany 1.000000 -0.035991 0.514285 0.311309 -0.118421
PercentSalaryHike -0.035991 1.000000 -0.027269 0.003634 -0.010238
MonthlyIncome 0.514285 -0.027269 1.000000 0.497855 0.149515
Age 0.311309 0.003634 0.497855 1.000000 0.299635
NumCompaniesWorked -0.118421 -0.010238 0.149515 0.299635 1.000000
In [24]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
In [ ]:
 
In [ ]:
 
In [25]:
data.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis="columns", inplace=True)
In [26]:
data.shape
Out[26]:
(1470, 31)
In [ ]:
 

Label Encoding¶

In [27]:
categorical_col = []
for column in data.columns:
    if data[column].dtype == object and len(data[column].unique()) <= 50:
        categorical_col.append(column)
In [28]:
categorical_col
Out[28]:
['Attrition',
 'BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']
In [29]:
data['Attrition'] = data.Attrition.astype("category").cat.codes
categorical_col.remove('Attrition')
In [30]:
# categorical_col.remove('BusinessTravel')
In [31]:
label = LabelEncoder()

for column in categorical_col:
    data[column] = label.fit_transform(data[column])
In [32]:
data.head()
Out[32]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender ... PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1 2 1102 2 1 2 1 2 0 ... 3 1 0 8 0 1 6 4 0 5
1 49 0 1 279 1 8 1 1 3 1 ... 4 4 1 10 3 3 10 7 1 7
2 37 1 2 1373 1 2 2 4 4 1 ... 3 2 0 7 3 3 0 0 0 0
3 33 0 1 1392 1 3 4 1 4 0 ... 3 3 0 8 3 3 8 7 3 0
4 27 0 2 591 1 2 1 3 1 1 ... 3 4 1 6 3 3 2 2 2 2

5 rows × 31 columns

Splitting the dataset¶

In [33]:
X = data.drop(['Attrition'], axis = 1)
y = data['Attrition']
In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
In [35]:
y_train
Out[35]:
995     0
727     0
68      0
1420    0
692     0
       ..
1084    0
582     0
1356    0
1438    1
6       0
Name: Attrition, Length: 1176, dtype: int8
In [ ]:
 
In [36]:
plt.title("Distribution of Attrition in the dataset")

plt.pie(data.Attrition.value_counts(), 
        labels = data.Attrition.value_counts(),
       colors = color)

plt.legend(data.Attrition.value_counts().index)
Out[36]:
<matplotlib.legend.Legend at 0x26e6ba61590>

Wealready know tat the dataset is imbalanced, so lets apply SMOTE technique to make it balanced.

SMOTE¶

In [37]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
In [38]:
encoder = OneHotEncoder()
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.fit_transform(X_test)

X_train_numeric = X_train.select_dtypes(include=['float64', 'int64'])
X_train_categorical = X_train.select_dtypes(include=['object'])

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_numeric, y_train)
In [39]:
X_test_encoded
Out[39]:
<294x1198 sparse matrix of type '<class 'numpy.float64'>'
	with 8820 stored elements in Compressed Sparse Row format>

Logistic Regression¶

In [40]:
logistic_model = LogisticRegression(solver='liblinear')
logistic_model.fit(X_train_resampled, y_train_resampled)

X_test = X_test[X_train_resampled.columns]
y_pred = logistic_model.predict(X_test)
In [41]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.7482993197278912
Confusion Matrix:
 [[197  54]
 [ 20  23]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.78      0.84       251
           1       0.30      0.53      0.38        43

    accuracy                           0.75       294
   macro avg       0.60      0.66      0.61       294
weighted avg       0.82      0.75      0.77       294

In [42]:
logistic_auc = roc_auc_score(y_test, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Logistic Regression (AUC = %0.2f)' % logistic_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()

Grid-Search Random Forest Classifier¶

In [43]:
from sklearn.model_selection import train_test_split, GridSearchCV
In [44]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42),  param_grid_rf, cv=5, n_jobs=-1)
grid_search_rf.fit(X_train_resampled, y_train_resampled)
Out[44]:
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]})
RandomForestClassifier(random_state=42)
RandomForestClassifier(random_state=42)
In [45]:
# X_test = X_test[X_train_resampled.columns]
y_pred = grid_search_rf.predict(X_test)
In [ ]:
 
In [46]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.8299319727891157
Confusion Matrix:
 [[231  20]
 [ 30  13]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.92      0.90       251
           1       0.39      0.30      0.34        43

    accuracy                           0.83       294
   macro avg       0.64      0.61      0.62       294
weighted avg       0.81      0.83      0.82       294

In [47]:
random_forest_auc = roc_auc_score(y_test, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Grid Search-Random Forest Classifier (AUC = %0.2f)' % random_forest_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 

XGBoost Classifier¶

In [48]:
xgb = XGBClassifier(random_state=7)
In [49]:
xgb.fit(X_train_resampled, y_train_resampled)
Out[49]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=7, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=7, ...)
In [50]:
y_pred = xgb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.8197278911564626
Confusion Matrix:
 [[225  26]
 [ 27  16]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.90      0.89       251
           1       0.38      0.37      0.38        43

    accuracy                           0.82       294
   macro avg       0.64      0.63      0.64       294
weighted avg       0.82      0.82      0.82       294

In [51]:
random_forest_auc = roc_auc_score(y_test, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='XGBoost Classifier (AUC = %0.2f)' % random_forest_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
In [ ]:
 

Instead of using the SMOTE technique, I will resample the dataset myself.

In [52]:
data.head()
Out[52]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EnvironmentSatisfaction Gender ... PerformanceRating RelationshipSatisfaction StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1 2 1102 2 1 2 1 2 0 ... 3 1 0 8 0 1 6 4 0 5
1 49 0 1 279 1 8 1 1 3 1 ... 4 4 1 10 3 3 10 7 1 7
2 37 1 2 1373 1 2 2 4 4 1 ... 3 2 0 7 3 3 0 0 0 0
3 33 0 1 1392 1 3 4 1 4 0 ... 3 3 0 8 3 3 8 7 3 0
4 27 0 2 591 1 2 1 3 1 1 ... 3 4 1 6 3 3 2 2 2 2

5 rows × 31 columns

Resampling the dataset again¶

In [53]:
from sklearn.utils import resample
In [54]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)
In [55]:
model = RandomForestClassifier()
model.fit(xtrain, ytrain)

ypred = model.predict(xtest)
accuracy = accuracy_score(ytest, ypred)

print(f'Accuracy: {accuracy}')
Accuracy: 0.8639455782312925
In [56]:
X, y = resample(X, y, replace=True, stratify=y)
In [57]:
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)
In [ ]:
 
In [ ]:
 

New Random Forest Classifier¶

In [58]:
y_pred = model.predict(xtest)
print("Accuracy:", accuracy_score(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))
print("Classification Report:\n", classification_report(ytest, y_pred))
Accuracy: 0.9546485260770975
Confusion Matrix:
 [[378   2]
 [ 18  43]]
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       380
           1       0.96      0.70      0.81        61

    accuracy                           0.95       441
   macro avg       0.96      0.85      0.89       441
weighted avg       0.95      0.95      0.95       441

In [59]:
new_rf_auc = roc_auc_score(ytest, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(ytest, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='New Random Forest Classifier (AUC = %0.2f)' % new_rf_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
In [ ]:
 

New XGBoost¶

In [60]:
xgb.fit(xtrain, ytrain)
Out[60]:
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=7, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=7, ...)
In [61]:
y_pred = xgb.predict(xtest)
print("Accuracy:", accuracy_score(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))
print("Classification Report:\n", classification_report(ytest, y_pred))
Accuracy: 0.9591836734693877
Confusion Matrix:
 [[377   3]
 [ 15  46]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98       380
           1       0.94      0.75      0.84        61

    accuracy                           0.96       441
   macro avg       0.95      0.87      0.91       441
weighted avg       0.96      0.96      0.96       441

In [62]:
new_xgb_auc = roc_auc_score(ytest, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(ytest, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='New XGBosot Classifier (AUC = %0.2f)' % new_rf_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
In [ ]:
 

ADA Boost¶

In [64]:
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
model = abc.fit(xtrain, ytrain)
In [65]:
y_pred = model.predict(xtest)
print("Accuracy:", accuracy_score(ytest, y_pred))
print("Confusion Matrix:\n", confusion_matrix(ytest, y_pred))
print("Classification Report:\n", classification_report(ytest, y_pred))
Accuracy: 0.9229024943310657
Confusion Matrix:
 [[372   8]
 [ 26  35]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.96       380
           1       0.81      0.57      0.67        61

    accuracy                           0.92       441
   macro avg       0.87      0.78      0.81       441
weighted avg       0.92      0.92      0.92       441

In [67]:
abc_auc = roc_auc_score(ytest, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(ytest, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='ADA Boost Classifier (AUC = %0.2f)' % abc_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
In [ ]:
 

Factor Analysis¶

In [111]:
X_std = StandardScaler().fit_transform(X)

# Initialize factor analysis object
fa = FactorAnalysis(n_components=3, random_state=0)

fa.fit(X_std)

loadings = fa.components_.T

feature_names = X.columns
factor_names = [f"Factor {i+1}" for i in range(n_factors)]
loadings_df = pd.DataFrame(loadings, columns=factor_names, index=feature_names)

# Print the factor loadings
print("Factor Loadings:\n", loadings_df)
Factor Loadings:
                           Factor 1  Factor 2  Factor 3
Age                       0.530641 -0.076421  0.047642
BusinessTravel            0.035199 -0.013354 -0.048325
DailyRate                 0.014752 -0.056018  0.043069
Department                0.051011 -0.006220  0.033339
DistanceFromHome         -0.015279  0.018852  0.042615
Education                 0.096562  0.008010  0.062439
EducationField           -0.018389  0.034419  0.003113
EnvironmentSatisfaction   0.001762  0.028274 -0.021741
Gender                   -0.029964 -0.005723 -0.041460
HourlyRate               -0.040842 -0.023192 -0.021789
JobInvolvement            0.005951  0.000549 -0.029541
JobLevel                  0.961464 -0.157041  0.003293
JobRole                  -0.082364 -0.034650  0.028849
JobSatisfaction           0.007491  0.013298 -0.015579
MaritalStatus            -0.067451  0.010377 -0.029364
MonthlyIncome             0.958079 -0.187924  0.011524
MonthlyRate               0.038677 -0.049803 -0.003254
NumCompaniesWorked        0.140146 -0.243126 -0.001987
OverTime                 -0.035255 -0.022218  0.021213
PercentSalaryHike        -0.063875 -0.019387  0.912270
PerformanceRating        -0.033014  0.025016  0.866429
RelationshipSatisfaction  0.088917  0.045270 -0.061777
StockOptionLevel         -0.042982  0.026418  0.035713
TotalWorkingYears         0.829761  0.087802  0.035045
TrainingTimesLastYear     0.029342  0.031088 -0.020535
WorkLifeBalance           0.056407 -0.013753 -0.001397
YearsAtCompany            0.662033  0.675190 -0.013795
YearsInCurrentRole        0.517017  0.638249  0.043816
YearsSinceLastPromotion   0.466992  0.466657  0.001490
YearsWithCurrManager      0.495566  0.660836  0.036522
In [115]:
important_features = {}
for factor in loadings_df.columns:
    important_features[factor] = loadings_df.index[loadings_df[factor].abs() > 0.4].tolist()

# Printing the important features for each factor
print("Important Features:\n", important_features)
Important Features:
 {'Factor 1': ['Age', 'JobLevel', 'MonthlyIncome', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], 'Factor 2': ['YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager'], 'Factor 3': ['PercentSalaryHike', 'PerformanceRating']}
In [116]:
plt.figure(figsize=(12, 10))
sns.heatmap(loadings_df, cmap="Blues", annot=True, cbar=True, center=0)
plt.title('Factor Loadings Heatmap')
plt.xlabel('Factors')
plt.ylabel('Features')
plt.show()
In [118]:
important_features_list = list(set([feature for features in important_features.values() for feature in features]))

# Make sure that 'X' is your original DataFrame that contains all the features
# Select only the important features from 'X'
X_important = X[important_features_list]
In [ ]:
 
In [ ]:
 

Lets split the data using just the important features

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X_important, y, test_size=0.3, random_state=0)
In [126]:
# Initialize the prediction algorithm
model = RandomForestClassifier(random_state=0)

# Train the model
model.fit(X_train, y_train)
Out[126]:
RandomForestClassifier(random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(random_state=0)
In [127]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.9047619047619048
Confusion Matrix:
 [[358  13]
 [ 29  41]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94       371
           1       0.76      0.59      0.66        70

    accuracy                           0.90       441
   macro avg       0.84      0.78      0.80       441
weighted avg       0.90      0.90      0.90       441

In [129]:
rfc_auc = roc_auc_score(ytest, y_pred)

# Generate ROC curve
fpr, tpr, _ = roc_curve(ytest, y_pred)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Random Forest Classifier (AUC = %0.2f)' % abc_auc, color = 'purple')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve of Random Forest Classifier With Important Features')
plt.legend()
plt.show()
In [ ]:
 

PCA¶

In [98]:
X_std = StandardScaler().fit_transform(X)

pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X_std)

# Let's try to identify 3 clusters for this example
kmeans = KMeans(n_clusters=3, random_state=0).fit(principalComponents)

# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])

pca_df['Cluster'] = kmeans.labels_

# Plot the clusters
plt.figure(figsize=(8, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Cluster', data=pca_df, palette='viridis')
plt.title('Clusters identified by PCA-reduced data')
plt.show()
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

Finding the number of Clusters¶

In [125]:
from sklearn.metrics import silhouette_score

# We will use the silhouette score to find the optimal number of clusters. We will test for 2 to 6 clusters.
silhouette_scores = []
range_n_clusters = list(range(2, 7))

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(principalComponents)
    cluster_labels = kmeans.labels_
    
    # Calculate silhouette score and append to list
    silhouette_avg = silhouette_score(principalComponents, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plotting these silhouette scores
plt.figure(figsize=(10, 5))
sns.lineplot(x=range_n_clusters, y=silhouette_scores, marker='o')
plt.title('Silhouette Scores for Various Numbers of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(range_n_clusters)
plt.show()
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)

Performing K Means with 2 clusters¶

In [87]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(principalComponents)

# Create a DataFrame for the PCA results
pca_df = pd.DataFrame(data=principalComponents, columns=['Principal Component 1', 'Principal Component 2'])

# Add the cluster labels
pca_df['Cluster'] = kmeans.labels_

# Plot the clusters
plt.figure(figsize=(8, 8))
sns.scatterplot(x='Principal Component 1', y='Principal Component 2', hue='Cluster', data=pca_df, palette='viridis')
plt.title('Clusters identified by PCA-reduced data')
plt.show()
C:\Python311\Lib\site-packages\sklearn\cluster\_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
  super()._check_params_vs_input(X, default_n_init=10)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: